Create a new file through File -> New File -> R Markdown. HTML is the recommended default output format. Note that a single line break in the source does not start a new line in the rendered output; you must leave a blank line in between.
Start a new line like this.
Or you could add 2 or more spaces at the end of the last line to start a
new line like this.
Press the “Knit” button to see how this page is rendered. If you have a LaTeX compiler on your computer, you may change the “output” to “pdf_document” so that RStudio will generate a PDF file using LaTeX.
For LaTeX equations, you may add an inline equation like this: \(y = \beta_0+\beta_1x\), and add a display equation like this: \[y = \beta_0+\beta_1x\].
Insert images like this:
You can find many helpful Markdown references here. Make sure that the image you insert is in the same folder as this .Rmd file; otherwise, use an absolute path.
# This is an R code chunk. Make sure you have ```{r} at the top of the chunk and ``` at the bottom.
# install.packages("ggplot2")
library(ggplot2) # Load a package like this. If you don't have this package yet, install it using the commented-out command above. You can type that command directly in the Console below. Click "Packages" in the right panel to see which packages you've already installed.
## Warning: package 'ggplot2' was built under R version 4.2.2
# Run the code in the current chunk by clicking the green triangle at the top-right corner of this chunk.
# To run only part of the code within a chunk (e.g. when you don't want to reload the data again), select the corresponding lines first, and then press Ctrl+Enter.
# Build a numeric vector with c()
vec <- c(5, 3, 1, 6, 4, 2, 7, 9) # At top level, "<-" and "=" both perform assignment
vec[1] # A bare expression is printed automatically; print() is not needed
## [1] 5
vec[1:3] # Elements 1 through 3
## [1] 5 3 1
vec[c(2, 4)] # Pick the 2nd and 4th elements of vec: 3, 6
## [1] 3 6
# DIFFERENT FROM PYTHON!!!
vec[-c(2, 4)] # Negative indices EXCLUDE positions: everything except the 2nd and 4th elements
## [1] 5 1 4 2 7 9
len <- length(vec) # Number of elements in vec
vec[len] # Indexing starts at 1, so position `len` is the last element
## [1] 9
# Two variables (vec and len) exist at this point. You can inspect them in the "Environment" tab of the right panel.
# Other ways to build vectors
v1 <- 1:8 # Consecutive integers
v1
## [1] 1 2 3 4 5 6 7 8
v2 <- seq(3, 21, by = 2) # From 3 to 21 in steps of 2
v2
## [1] 3 5 7 9 11 13 15 17 19 21
v3 <- rep(5, times = 4) # rep(value, times): repeat a value
v3
## [1] 5 5 5 5
v4 <- rep(v1, times = 2) # Repeat a whole vector
v4
## [1] 1 2 3 4 5 6 7 8 1 2 3 4 5 6 7 8
v1 + vec # Element-wise addition
## [1] 6 5 4 10 9 8 14 17
c(v1, vec) # Concatenation
## [1] 1 2 3 4 5 6 7 8 5 3 1 6 4 2 7 9
v1 <- c(v1, 100) # Append a value; v1 now has 9 elements
# Build matrices from a vector of 6 values
v5 = c(1:6)
m1 = matrix(v5, nrow=2, byrow=TRUE) # Fill the 2-row matrix row by row
m1
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
m2 = matrix(v5, nrow=2, byrow=FALSE) # Fill the matrix column by column
m2
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
rowname = c("r1", "r2")
colname = c("c1", "c2", "c3")
m3 = matrix(v5, nrow=2, dimnames = list(rowname, colname)) # Default: byrow=FALSE; dimnames = list(row names, column names)
m3
## c1 c2 c3
## r1 1 3 5
## r2 2 4 6
t(m3) # Transpose
## r1 r2
## c1 1 2
## c2 3 4
## c3 5 6
m3[1, 2] # [row, col]
## [1] 3
m3[1,] # First row
## c1 c2 c3
## 1 3 5
m3[,2] # Second col
## r1 r2
## 3 4
m1*m3 # Element-by-element multiplication
## c1 c2 c3
## r1 1 6 15
## r2 8 20 36
m1 %*% t(m3) # Matrix multiplication: (2x3) %*% (3x2) gives a 2x2 matrix
## r1 r2
## [1,] 22 28
## [2,] 49 64
# NOTE: v1 has 9 elements at this point (100 was appended to it earlier) while vec has 8.
# R recycles the shorter vector from its start to fill the gap and emits the warnings shown
# below, which is why the 9th entry of vec is 5 (= vec[1]).
m4 = cbind(v1, vec) # Column bind: each vector becomes a column
## Warning in cbind(v1, vec): number of rows of result is not a multiple of vector
## length (arg 2)
m4
## v1 vec
## [1,] 1 5
## [2,] 2 3
## [3,] 3 1
## [4,] 4 6
## [5,] 5 4
## [6,] 6 2
## [7,] 7 7
## [8,] 8 9
## [9,] 100 5
m5 = rbind(v1, vec) # Row bind: each vector becomes a row
## Warning in rbind(v1, vec): number of columns of result is not a multiple of
## vector length (arg 2)
m5
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## v1 1 2 3 4 5 6 7 8 100
## vec 5 3 1 6 4 2 7 9 5
Data.frame is a very special data structure in R.
data(iris) # Load the built-in iris dataset
dim(iris) # Number of rows and columns. These can also be obtained with nrow() and ncol() respectively.
## [1] 150 5
head(iris) # View the first 6 rows of the dataset
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
tail(iris, 3) # View the last 3 rows of the dataset
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
names(iris) # The names of the columns
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
vec_species = iris$Species # Access the column "Species" by using $
vec_species[1] # A column can be indexed like a vector (here it is a factor, so its levels are printed too)
## [1] setosa
## Levels: setosa versicolor virginica
str(iris) # Structure: type and first few values of every column
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris) # Per-column summary: min/quartiles/mean/max for numeric columns, counts per level for the factor
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
table(iris$Species) # Frequency count of each species
##
## setosa versicolor virginica
## 50 50 50
unique(iris$Species) # Distinct values found in the column
## [1] setosa versicolor virginica
## Levels: setosa versicolor virginica
View(iris) # View the data in RStudio. You could also double-click this data in the "Environment" tab of the right panel.
# NOTE(review): View() is meant for interactive sessions; confirm it behaves as intended when the document is knitted.
If you want to extract the subset of the data with Species == ‘setosa’, try the which() function.
# which() returns the positions where the condition is TRUE, i.e. the row indices of the setosa flowers
id <- which(iris$Species == "setosa")
# Keep only those rows and, in the same step, drop the 5th column (Species)
iris_setosa <- iris[id, -5]
head(iris_setosa)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
Elements can be accessed by row and column index, as in a matrix, or by column name.
# Matrix-style indexing: row 3, column 4 (the Petal.Width column)
iris[3,4]
## [1] 0.2
# Name-based access: take the Petal.Length column with $, then its 4th element
iris$Petal.Length[4]
## [1] 1.5
Basic operation functions.
max(iris$Sepal.Length) # Largest value
## [1] 7.9
min(iris$Sepal.Length) # Smallest value
## [1] 4.3
which.max(iris$Sepal.Length) # Index (row number) of the largest value, not the value itself
## [1] 132
mean(iris$Sepal.Length) # Arithmetic mean
## [1] 5.843333
median(iris$Sepal.Length) # Median
## [1] 5.8
sd(iris$Sepal.Length) # Standard deviation
## [1] 0.8280661
Load data from .csv file and create a new data.frame.
# Read the CSV file from the current working directory into a data.frame.
# NOTE: the variable name `data` is the same as the built-in data() function used earlier to load iris;
# calls such as data(iris) still work, but a more specific name would be less confusing.
data = read.csv("fuel2001.csv") # Load data
head(data)
## X Drivers FuelC Income Miles MPC Pop Tax
## 1 AL 3559897 2382507 23471 94440 12737.00 3451586 18.0
## 2 AK 472211 235400 30064 13628 7639.16 457728 8.0
## 3 AZ 3550367 2428430 25578 55245 9411.55 3907526 18.0
## 4 AR 1961883 1358174 22257 98132 11268.40 2072622 21.7
## 5 CA 21623793 14691753 32275 168771 8923.89 25599275 18.0
## 6 CO 3287922 2048664 32949 85854 9722.73 3322455 22.0
# Derived variables, computed element-wise over whole columns
Fuel = 1000*data$FuelC/data$Pop # FuelC per 1000 units of Pop
Dlic = 1000*data$Drivers/data$Pop # Drivers per 1000 units of Pop
# Unnamed expression arguments get auto-generated column names (data.Tax, log.data.Miles., ... as shown below);
# write e.g. Tax = data$Tax inside data.frame() if you want cleaner names.
newdata = data.frame(Fuel, data$Tax, Dlic, data$Income, log(data$Miles)) # Create a new data.frame
newdata$MPC = data$MPC # Append a new column
head(newdata)
## Fuel data.Tax Dlic data.Income log.data.Miles. MPC
## 1 690.2644 18.0 1031.3801 23471 11.455720 12737.00
## 2 514.2792 8.0 1031.6411 30064 9.519882 7639.16
## 3 621.4751 18.0 908.5972 25578 10.919533 9411.55
## 4 655.2927 21.7 946.5706 22257 11.494069 11268.40
## 5 573.9129 18.0 844.7033 32275 12.036298 8923.89
## 6 616.6115 22.0 989.6062 32949 11.360403 9722.73
R is very convenient to generate figures. Here are some basic graphics functions in R.
pairs(iris) # Pairwise scatterplots of all columns
pairs(iris[1:3]) # Only use the first 3 columns
pairs(~iris$Sepal.Length + iris$Petal.Length + iris$Petal.Width) # Choose specific columns with a one-sided formula
par(mfrow=c(2,2)) # Arrange the following 4 plots together in a 2*2 grid
hist(iris$Sepal.Length) # Histogram of one variable
boxplot(iris$Sepal.Length) # Boxplot of one variable
boxplot(iris) # One boxplot per column of the data.frame
# Scatterplot with custom axis labels and title
plot(iris$Sepal.Length, iris$Petal.Length, xlab = "this is the x-axis",
ylab = "this is the y-axis",
main = "Plot of X vs Y")
The functions mentioned above can handle most of the plotting tasks in your homework. The professor also introduced ggplot2 in the lecture and provided a link to its documentation, so you can learn it on your own. At the end of this page, I list some of its usage, as introduced in UM’s lab, for your reference.
set.seed(1) # Fix the random seed so the simulated data are reproducible
for (i in seq_len(3)) {
  # 100 draws from the uniform distribution on [-1, 1]
  x1 <- runif(100, min = -1, max = 1)
  # i * x1 plus standard normal noise (mean 0, sd 1) scaled down by a factor of 10
  x2 <- i * x1 + rnorm(100, mean = 0, sd = 1) / 10
  y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(100)
  # Data are simulated for every i, but nothing is drawn when i is 2
  if (i == 2) {
    next
  }
  plot(y, x1) # The first argument (y) goes on the horizontal axis
  abline(a = -0.5, b = 1 / 4) # a is the intercept, b is the slope
  # https://www.rdocumentation.org/packages/graphics/versions/3.6.2/topics/abline
}
The library ggplot2 offers a plotting system with substantial capabilities and nicer graphics than the base plotting functions.
# install.packages("ggplot2")
# if you don't have this package, you need to install it.
library(ggplot2)
Building blocks of a graph include: data, aesthetic mapping, geometric object, statistical transformations, scales, coordinate system, position adjustments and faceting.
Aesthetic Mapping: something you can see, for example, position, color, fill, shape, linetype, size and so on. Aesthetic mappings are set with the aes() function.
Geometric Objects: the actual marks we put on a plot, for example, points (geom_point), lines (geom_line), histograms (geom_histogram).
# Load the whitespace-separated text file from the current working directory; header = TRUE takes the column names from the first line.
# The city names end up as row names (see the output below) — presumably because the header line has one field fewer than the data lines; verify against the file.
data_crime = read.table("city_crime.txt", header = TRUE)
head(data_crime)
## Murder Rape Robbery Assault Burglary Larceny MVT
## New-York 21.3 36.3 988.8 814.5 1204.6 2859.9 1300.7
## LA 23.8 43.8 868.0 1123.4 1226.2 3120.5 1434.3
## Chicago 33.1 40.0 1210.5 1440.9 1563.6 4323.4 1427.9
## Houston 21.3 53.0 567.7 665.5 1451.5 3239.2 1287.2
## Philly 25.9 46.2 814.2 436.2 903.9 2588.3 1620.0
## San-Diego 9.7 34.5 329.0 704.8 1102.8 3012.0 1371.8
qplot() stands for “quick plot”. Much of its syntax is similar to that of plot().
# Base graphics version of the scatterplot
plot(x = data_crime$Robbery, y = data_crime$Assault, xlab = 'Robbery', ylab = 'Assault', main = 'Crime')
# The same scatterplot with ggplot2's quick-plot helper; columns are referred to by bare name via data =
qplot(x = Robbery, y = Assault, data = data_crime, xlab = 'Robbery', ylab = 'Assault', main = 'Crime')
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
# NOTE: as the warning above says, qplot() is deprecated; in new code build the plot with ggplot() + geom_point() + labs() instead.
A plot must have at least one geom.
# Base plot object: the data plus the x/y mapping shared by every layer added below
crime_plot <- ggplot(data_crime, aes(x = Robbery, y = Assault))
# Scatterplot: add a layer of points
crime_plot + geom_point()
# Line plot: connect the observations
crime_plot + geom_line()
# Map additional variables to the colour and size of the points
crime_plot + geom_point(aes(colour = Murder, size = Larceny))
# Custom colour gradient (blue for low values, red for high) on a black-and-white theme
crime_plot +
  geom_point(aes(colour = Murder)) +
  scale_colour_gradient(low = "blue", high = "red") +
  theme_bw()
# Text layer: label each observation with its row name, skipping labels that would overlap
crime_plot +
  geom_text(aes(label = rownames(data_crime)), check_overlap = TRUE)
After creating your basic graph, what if you want to add a title or adjust the size of the text?
# Scatterplot of Assault against Robbery, coloured by Murder
crime_plot <- ggplot(data = data_crime) +
  geom_point(aes(x = Robbery, y = Assault, color = Murder))
# Add a title and axis labels
crime_plot <- crime_plot + labs(title = "Crime", x = "Robbery", y = "Assault")
# Change the size of all text elements
crime_plot + theme(text = element_text(size = 20))
# Change only the size of the title
crime_plot + theme(plot.title = element_text(size = 20))
# Annotate the graph with a reference line and a single text label.
# annotate() draws the label exactly once; geom_text() with constant aesthetics
# would draw one copy per row of data_crime, stacked on top of each other.
crime_plot +
  geom_abline(intercept = 1500, slope = -0.5) +
  annotate("text", x = 1500, y = 800, label = "separating hyperplane", size = 5)
Pairs plots are useful to visualize multiple variables at the same time. The base function is pairs, but here is an example with ggplot.
# install.packages("GGally")
# GGally extends ggplot2; install it with the command above if it is missing.
library(GGally)
# Pairs plot of every column: scatterplots above the diagonal, correlations below it,
# density curves on the diagonal, and no axis labels.
crime_pairs <- ggpairs(
  data_crime,
  upper = list(continuous = "points", combo = "dot"),
  lower = list(continuous = "cor", combo = "dot"),
  diag = list(continuous = "densityDiag"),
  axisLabels = "none"
) +
  theme_bw()
crime_pairs
Sometimes it is useful to create 3D plots to visualize multivariate data. The library plotly can do so.
# install.packages("plotly")
# Install plotly with the command above if it is missing.
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
# Interactive plot with three axes; columns are given as one-sided formulas (~name)
# and Larceny is mapped to the colour of the markers.
p <- plot_ly(
  data_crime,
  x = ~Robbery,
  y = ~Assault,
  z = ~Murder,
  color = ~Larceny
)
p